1 Kindara app and dataset

knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

2 Data preparation

2.1 Load CSV, filter users, save Rdata files

Transform CSV into binary Rdata files

input_folder = paste0(IO$input_data, "Days/")
output_folder = paste0(IO$tmp_data,"Days_Rdata_from_csv/")
if(!dir.exists(output_folder)){dir.create(output_folder)}

files = list.files(input_folder)

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

users = foreach(file = files, .combine = c) %dopar%{

  days = read.csv(paste0(input_folder,file), stringsAsFactors = FALSE) 
  
  # colnames
  colnames(days)[colnames(days) == "account"] = "user_id"
  
  # identifying
  users_with_pos_preg_tests = unique(days$user_id[which(days$preg_test == 1)])
 
  # formating
  days$date = as.Date(days$date)
  # transforming first_day into a logical
  days$first_day_o = days$first_day
  days$first_day = ifelse(days$first_day == "True", TRUE, FALSE)
  
  new_file_name = gsub("csv","Rdata",file)
  save(days, file = paste0(output_folder,new_file_name))

  return(users_with_pos_preg_tests)
}

stopImplicitCluster()

save(users, file = paste0(IO$tmp_data, "full_list_users_with_pos_preg_tests.Rdata"))

Create a user table from the list of users that ever logged a positive pregnancy test

users = data.frame(user_id = as.character(unique(users)), pos_preg_test = TRUE, stringsAsFactors = FALSE)
users = users[order(users$user_id),]

batch_size = min(par$max_batch_size, ceiling(nrow(users)/par$min_n_batches))
n_batches = ceiling(nrow(users)/batch_size)
users$batch = rep(1:n_batches, each = batch_size)[1:nrow(users)]

save(users, file = paste0(IO$output_data, "users.Rdata"))
ok = file.copy(from = paste0(IO$output_data, "users.Rdata"), to = paste0(IO$tmp_data, "users_with_pos_preg_tests.Rdata"), overwrite = TRUE)

Filter the days table and re-organize users into batches

input_folder = paste0(IO$tmp_data,"Days_Rdata_from_csv/")
tmp_folder = paste0(IO$tmp_data,"Days_filtered_split_batches/")
if(!dir.exists(tmp_folder)){dir.create(tmp_folder)}

files = list.files(input_folder)

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

ok = foreach(file = files) %dopar%{

  load(paste0(input_folder,file), verbose = TRUE) 
  full_days = days
  
  # filtering
  full_days = full_days[full_days$user_id %in% users$user_id,]
  full_days$input_file_id = file
  
  # split by batches
  for(b in unique(users$batch[users$user_id %in% full_days$user_id])){
    days = full_days[full_days$user_id %in% users$user_id[users$batch == b],]
    days$batch = b
    save(days, file = paste0(tmp_folder,"batch_",b,"_",file))
  }
  
}
stopImplicitCluster()
input_folder = paste0(IO$tmp_data,"Days_filtered_split_batches/")
output_folder = paste0(IO$output_data,"Days/")
tmp_folder = paste0(IO$tmp_data, "Days_filtered/")
if(dir.exists(input_folder)){unlink(output_folder, recursive = TRUE);dir.create(output_folder)}
if(!dir.exists(tmp_folder)){dir.create(tmp_folder)}

files = list.files(input_folder)

input_files = foreach(b = unique(users$batch), .combine = rbind) %do%{
  
  cl = makeCluster(par$n_cores)
  registerDoParallel(cl)

  batch_files = files[grep(paste0("batch_",b,"_day"), files)]
  
  days = foreach(file = batch_files, .combine = rbind) %dopar%{
      load(paste0(input_folder,file), verbose = TRUE) 
      return(days)
  }
  stopImplicitCluster()
  
  
  save(days, file = paste0(output_folder,"days_",b,".Rdata"))
  file.copy(from = paste0(output_folder,"days_",b,".Rdata"), to = paste0(tmp_folder,"days_",b,".Rdata"), overwrite = TRUE)
  
  input_files = aggregate(input_file_id ~ user_id, days, function(x){paste0(unique(sort(x)),collapse = "|")})
  return(input_files)
}

save(input_files, file = paste0(IO$tmp_data, "input_files.Rdata"))
input_files = aggregate(input_file_id ~ user_id, input_files, function(x){paste0(unique(sort(x)),collapse = "|")})
users$input_files = input_files$input_file_id[match(users$user_id, input_files$user_id)]

save(users, file = paste0(IO$output_data,"users.Rdata"))
file.copy(from = paste0(IO$output_data,"users.Rdata"), to = paste0(IO$tmp_data,"users_with_original_file_id.Rdata"), overwrite = TRUE)
## [1] TRUE

2.2 Create a cycles table

Note: we cannot use the cycles table that Kindara provided because the account ids are not linked between the 3 tables.

# load(paste0(IO$output_data,"users.Rdata"), verbose = TRUE)

cycles_input_folder = paste0(IO$input_data,"Cycles/")
cycles_files = list.files(cycles_input_folder)

cycles = foreach(file  = cycles_files) %dopar%
{
  file  = cycles_files[3]
  cycles = read.csv(paste0(cycles_input_folder,file), stringsAsFactors = FALSE)
  dim(cycles)
  colnames(cycles)
  cycles$user_id = cycles$account
  j = which(cycles$user_id %in% users$user_id)
  length(j)
  return(cycles[j,])
}

dim(cycles)

We thus need to create the cycles from the days table by looking at which days have the flag first_day.

We will take this opportunity of going through the days table to

  • clean some columns (transform first_day into a logical)

  • remove duplicated rows in the days table

# load(paste0(IO$output_data,"users.Rdata"), verbose = TRUE)

days_input_folder = paste0(IO$output_data,"Days/")
days_files = list.files(days_input_folder)

cycles = foreach(file  = days_files, .combine = rbind) %dopar%
{
  load(paste0(days_input_folder,file), verbose = TRUE)
  colnames(days)
  dim(days)
  
  # checking for duplicated rows
  d = duplicated(days)
  j = which(d)
  if(length(j)>0){
    days = days[-j,]
  }
  
  # creating the cycles table
  cycles = days[days$first_day, c("user_id","date")]
  colnames(cycles)[which(colnames(cycles) == "date")] = "start_date"
  cycles = cycles[order(cycles$user_id, cycles$start_date),]  
  
  j = which(cycles$user_id %in% users$user_id)
  length(j)
  cycles = cycles[j,]

  return(cycles)
}

dim(cycles)
## [1] 83477     2
save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_first_version.Rdata"), overwrite = TRUE)
## [1] TRUE

We create unique cycle ID in the cycle table

# load(paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
cycles = cycles[order(cycles$user_id, cycles$start_date),]

cycles$cycle_nb = ave(rep(1, nrow(cycles)), cycles$user_id, FUN = cumsum)
cycles$cycle_id = paste0(cycles$user_id, "_" ,cycles$cycle_nb)

cycles$end_date = cycles$start_date[match(cycles$cycle_id, paste0(cycles$user_id,"_",cycles$cycle_nb-1))] - 1

cycles$cycle_length = as.numeric(cycles$end_date - cycles$start_date + 1)


save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_with_nb_and_id.Rdata"), overwrite = TRUE)
## [1] TRUE

And associate each row of the days to a cycle

days_folder = paste0(IO$output_data,"Days/")
days_tmp_folder = paste0(IO$tmp_data,"Days_with_cycle_id/")
if(!dir.exists(days_tmp_folder)){dir.create(days_tmp_folder)}



cl = makeCluster(par$n_cores)
registerDoParallel(cl)

days_files = list.files(days_folder)

ok = foreach(file  = days_files) %dopar%
{
  load(paste0(days_folder,file), verbose = TRUE)
  colnames(days)
  dim(days)
  
  # take the part of cycles that matches with the days users
  j = which((cycles$user_id %in% unique(days$user_id)) & (!is.na(cycles$cycle_length)))
  cycles_sub = cycles[j,]
  
  # expand cycles for each day
  cycles_sub_exp = as.data.frame(lapply(cycles_sub, rep, cycles_sub$cycle_length))
  cycles_sub_exp$cycleday = ave(rep(1,nrow(cycles_sub_exp)), cycles_sub_exp$cycle_id, FUN =cumsum)
  cycles_sub_exp$date = cycles_sub_exp$start_date + (cycles_sub_exp$cycleday - 1)
  cycles_sub_exp$day_id = paste0(cycles_sub_exp$user_id, "_", cycles_sub_exp$date)
  
  
  # match days and cycles_sub_exp
  days$day_id =  paste0(days$user_id, "_", days$date)
  m = match(days$day_id, cycles_sub_exp$day_id)
  days$cycle_nb = cycles_sub_exp$cycle_nb[m]
  days$cycle_id = cycles_sub_exp$cycle_id[m]
  days$cycle_length = cycles_sub_exp$cycle_length[m]
  days$cycleday = cycles_sub_exp$cycleday[m]
  
  days$cycleday_from_end = days$cycleday - days$cycle_length - 1
  
  #d = duplicated(days[,c("user_id","date","cycleday")])
  #if(sum(d)>0){days = days[-which(d),]}
  
  save(days, file = paste0(days_folder, file))
  file.copy(from = paste0(days_folder, file), to = paste0(days_tmp_folder, file), overwrite = TRUE)

}

stopImplicitCluster()

2.3 Aggregated cycles variable

Now we can aggregate the days table to report useful information on the cycles table

  • aggregate to create the cycles table

    • user_id –v
    • cycle_id –v
    • cycle_nb –v
    • cycle_length –v
    • n_days_obs –v
    • day_last_obs [cycleday] –v
    • n_pos_preg_test –v
    • n_neg_preg_test –v
    • day_first_pos_preg_test [cycleday] –v
    • day_last_pos_preg_test [cycleday] –v
    • n_days_obs_after_first_pos_preg_test –v
    • last_preg_test (0, 1, -1) –v
    • preg_test_class (0 = no preg test; 1 = at least one positive preg test ; -1 = only negative preg tests)
    • n_tot_sex –v
    • n_prot_sex –v
    • n_unprot_sex –v
    • n_withdrawal –v
    • n_insemination –v
    • n_BBT –v
input_days_folder = paste0(IO$tmp_data,"Days_with_cycle_id/")
output_days_folder = paste0(IO$output_data,"Days/")

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

days_files = list.files(input_days_folder)

cycles_agg = foreach(file  = days_files, .combine = rbind, .packages = c('plyr','dplyr')) %dopar%
{
  load(paste0(input_days_folder,file), verbose = TRUE)
  colnames(days)
  dim(days)
  
  # take this oppotunity to change the way preg tests are encoded
  days$preg_test_o = days$preg_test
  days$preg_test[days$preg_test == 2] = -1
  
  save(days, file = paste0(output_days_folder,file))
  
  # 
  cycles_agg = ddply(days, 
                     .(cycle_id), 
                     .parallel = TRUE,  # FALSE,  # TRUE
                     .fun = summarize,
                     cycle_length = min(cycle_length),
                     n_days_obs = lu(date),
                     last_obs_day = max(cycleday),
                     n_pos_preg_test = sum(preg_test == 1),
                     n_neg_preg_test = sum(preg_test == -1),
                     day_first_pos_preg_test = min( cycleday_from_end * (preg_test == 1), na.rm = TRUE),
                     day_last_pos_preg_test = max(cycleday * (preg_test == 1), na.rm = TRUE),
                     day_last_preg_test  = max(cycleday * (preg_test %in%  c(1,-1)), na.rm = TRUE),
                     n_tot_sex = sum(sex > 0, na.rm = TRUE),
                     n_prot_sex = sum(sex == 1, na.rm = TRUE),
                     n_unprot_sex =  sum(sex == 2, na.rm = TRUE),
                     n_withdrawal =  sum(sex == 3, na.rm = TRUE),
                     n_insemination = sum(sex == 4, na.rm = TRUE),
                     n_BBT = sum(!is.na(temperature), na.rm = TRUE))
  
  
   j = which(cycles_agg$day_first_pos_preg_test < 0)
   cycles_agg$day_first_pos_preg_test[j] = cycles_agg$cycle_length[j] + cycles_agg$day_first_pos_preg_test[j] + 1
   
   cycles_agg$n_pos_preg_test[is.na(cycles_agg$n_pos_preg_test)] = 0
   cycles_agg$n_neg_preg_test[is.na(cycles_agg$n_neg_preg_test)] = 0
   cycles_agg$day_first_pos_preg_test[is.infinite(cycles_agg$day_first_pos_preg_test)] = 0
   cycles_agg$day_last_pos_preg_test[is.infinite(cycles_agg$day_last_pos_preg_test)] = 0
   
   
   # n_days_obs_after_first_pos_preg_test
   days$day_first_pos_preg_test = cycles_agg$day_first_pos_preg_test[match(days$cycle_id, cycles_agg$cycle_id)]
   days$after_first_pos_preg_test = (days$day_first_pos_preg_test > 0) & (days$cycleday > days$day_first_pos_preg_test)
   
   cycles_agg2 = aggregate(after_first_pos_preg_test ~ cycle_id, days, sum, na.rm = TRUE )
   cycles_agg$n_days_obs_after_first_pos_preg_test = cycles_agg2$after_first_pos_preg_test[match(cycles_agg$cycle_id, cycles_agg2$cycle_id)]
   
    # last_preg_test
   days$day_last_preg_test = cycles_agg$day_last_preg_test[match(days$cycle_id, cycles_agg$cycle_id)]
   cycles_agg2 = days[which(days$cycleday == days$day_last_preg_test),]
   cycles_agg$last_preg_test = cycles_agg2$preg_test[match(cycles_agg$cycle_id, cycles_agg2$cycle_id)]
   cycles_agg$last_preg_test[is.na(cycles_agg$last_preg_test)]= 0
   
   # preg_test_class
   #cycles_agg$preg_test_class = ifelse(cycles_agg$n_pos_preg_test>0,ifelse(cycles_agg$last_preg_test == 1, "pregnant","pregnancy loss"), ifelse(cycles_agg$n_neg_preg_test>0,"not pregnant", "not tested"))
   cycles_agg$preg_test_class = ifelse(cycles_agg$n_pos_preg_test>0,"pregnant", ifelse(cycles_agg$n_neg_preg_test>0,"not pregnant", "not tested"))
   
   
   return(cycles_agg)

}

stopImplicitCluster()

save(cycles_agg, file = paste0(IO$tmp_data, "cycles_agg.Rdata"))
column_names = colnames(cycles_agg)
column_names = column_names[-which(column_names %in% colnames(cycles))]
m = match(cycles$cycle_id, cycles_agg$cycle_id)
for(column  in column_names){
  eval(parse(text = paste0("cycles$",column,"= cycles_agg$",column,"[m]")))
  #eval(parse(text = paste0("cycles$",column,"[is.na(cycles$",column,")]= 0")))
}

save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_with_agg.Rdata"), overwrite = TRUE)
## [1] TRUE

2.4 Filtering users

  • with at least 4 cycles before first positive pregnancy test (1st positive preg test in cycle 5+)
  • none of these cycles should be shorter than 15 days
  • with at least 2 cycles or 20+ days with observations after the first positive preg test (from cycle 6)
#load(paste0(IO$tmp_data,"users_with_original_file_id.Rdata"),verbose = TRUE)

users_agg = suppressWarnings(
  ddply(cycles, 
        .(user_id), 
        .fun = summarize,
        n_cycles = max(cycle_nb, na.rm = TRUE),
        n_days_obs = sum(n_days_obs, na.rm = TRUE),
        n_pos_cycles = sum(n_pos_preg_test > 0, na.rm = TRUE),
        first_cycle_preg = min(cycle_nb[n_pos_preg_test > 0], na.rm = TRUE))
)

users_agg$first_cycle_preg[is.infinite(users_agg$first_cycle_preg)] =  0

# n_obs_after_first_preg
cycles_tmp = cycles
cycles_tmp$first_cycle_preg = users_agg$first_cycle_preg[match(cycles_tmp$user_id, users_agg$user_id)]
users_agg2 = aggregate(n_days_obs ~ user_id, cycles_tmp[cycles_tmp$cycle_nb > cycles_tmp$first_cycle_preg,  ], sum, na.rm = TRUE)

users_agg$n_days_obs_after_first_preg = users_agg2$n_days_obs[match(users_agg$user_id, users_agg2$user_id)]
users_agg$n_days_obs_after_first_preg [is.na(users_agg$n_days_obs_after_first_preg )] = 0

users_agg$n_cycles_after_first_preg = users_agg$n_cycles - users_agg$first_cycle_preg


# minimal cycle length before the first positive preg test
users_agg2 = aggregate(cycle_length ~ user_id, 
                       cycles_tmp[cycles_tmp$cycle_nb < cycles_tmp$first_cycle_preg,  ], 
                       min, na.rm = TRUE)

users_agg$shortest_cycle_before_first_pos_preg = users_agg2$cycle_length[match(users_agg$user_id, users_agg2$user_id)]



# adding new columns to the users table

column_names = colnames(users_agg)
column_names = column_names[-which(column_names %in% colnames(users))]
m = match(users$user_id, users_agg$user_id)
for(column  in column_names){
  eval(parse(text = paste0("users$",column,"= users_agg$",column,"[m]")))
}

save(users, file = paste0(IO$output_data,"users.Rdata"))
file.copy(from = paste0(IO$output_data,"users.Rdata"), to = paste0(IO$tmp_data,"users_with_agg.Rdata"), overwrite = TRUE)
## [1] TRUE
j = which(
  (users$first_cycle_preg >= 5)&
    (users$shortest_cycle_before_first_pos_preg > 15) &
    ((users$n_cycles_after_first_preg >=2) | (users$n_days_obs_after_first_preg >= 20)))

# users
users = users[j,]
save(users, file = paste0(IO$output_data,"users.Rdata"))
file.copy(from = paste0(IO$output_data,"users.Rdata"), to = paste0(IO$tmp_data,"users_filtered.Rdata"), overwrite = TRUE)
## [1] TRUE
# cycles
cycles = cycles[which(cycles$user_id %in% users$user_id),]
save(cycles, file = paste0(IO$output_data,"cycles.Rdata"))
file.copy(from = paste0(IO$output_data,"cycles.Rdata"), to = paste0(IO$tmp_data,"cycles_filtered.Rdata"), overwrite = TRUE)
## [1] TRUE
# days
days_folder = paste0(IO$output_data,"Days/")

cl = makeCluster(par$n_cores)
registerDoParallel(cl)

days_files = list.files(days_folder)

days = foreach(file  = days_files, .combine = rbind) %dopar%
{
  load(paste0(days_folder,file), verbose = TRUE)
  colnames(days)
  dim(days)
  j = which(days$user_id %in% users$user_id)
  days = days[j,]
  return(days)
}

stopImplicitCluster()

save(days, file = paste0(IO$output_data,"days.Rdata"))
file.copy(from = paste0(IO$output_data,"days.Rdata"), to = paste0(IO$tmp_data,"days_filtered.Rdata"), overwrite = TRUE)
## [1] TRUE

2.5 Fill the users table

  • aggregate

    • avg, median and sd of cycle_length (cycles without positive pregnancy tests)
    • avg, median and sd of cycle_length (cycles before first positive pregnancy tests)
#load(paste0(IO$tmp_data,"users_with_original_file_id.Rdata"),verbose = TRUE)
cycles_tmp = cycles_tmp[cycles_tmp$user_id %in% users$user_id,]

users_agg = suppressWarnings(
  ddply(cycles_tmp, 
        .(user_id), 
        .fun = summarize,
        cycle_length_no_preg_avg = mean(cycle_length[n_pos_preg_test == 0], na.rm = TRUE),
        cycle_length_no_preg_median = median(cycle_length[n_pos_preg_test == 0], na.rm = TRUE),
        cycle_length_no_preg_sd = sd(cycle_length[n_pos_preg_test == 0], na.rm = TRUE),
        cycle_length_before_preg_avg = mean(cycle_length[cycle_nb < first_cycle_preg], na.rm = TRUE),
        cycle_length_before_preg_median = median(cycle_length[cycle_nb < first_cycle_preg], na.rm = TRUE),
        cycle_length_before_preg_sd = sd(cycle_length[cycle_nb < first_cycle_preg], na.rm = TRUE))
)


column_names = colnames(users_agg)
column_names = column_names[-which(column_names %in% colnames(users))]
m = match(users$user_id, users_agg$user_id)
for(column  in column_names){
  eval(parse(text = paste0("users$",column,"= users_agg$",column,"[m]")))
}

save(users, file = paste0(IO$output_data,"users.Rdata"))
file.copy(from = paste0(IO$output_data,"users.Rdata"), to = paste0(IO$tmp_data,"users_with_cycle_length_stats.Rdata"), overwrite = TRUE)
## [1] TRUE

3 Data presentation

3.1 Number of users, or cycles, of positive and negative pregnancy tests

3.2 Example of users data

knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

4 Pregnancy losses

4.1 Histogram of cycle length

  • for cycles with positive pregnancy tests, with only negative pregnancy tests and with no pregnancy tests.

  • thresholds for EPL (Early Pregnancy loss), LPL_P (Late Pregnancy loss or premature birth), LB (live birth) and BF (breast feeding)

load(paste0(IO$output_data, "users.Rdata"), verbose = TRUE)
## Loading objects:
##   users
load(paste0(IO$output_data, "cycles.Rdata"), verbose = TRUE)
## Loading objects:
##   cycles
load(paste0(IO$output_data, "days.Rdata"), verbose = TRUE)
## Loading objects:
##   days
g = ggplot(cycles, aes(x = cycle_length, fill = preg_test_class)) + 
  geom_histogram(aes(y = ..density..), binwidth = 1, position = "identity")+
  xlim(c(0,750))+
  facet_grid(preg_test_class ~ .)
g
## Warning: Removed 1502 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

g = ggplot(cycles[cycles$preg_test_class != "pregnant",], aes(x = cycle_length, fill = preg_test_class)) + 
  geom_histogram(aes(y = ..density..), binwidth = 1, position = "identity", alpha = 0.5)+
  xlim(c(0,150))
  #facet_grid(preg_test_class ~ .)
g
## Warning: Removed 1735 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

g_hist_lt = ggplot(cycles, aes(x = cycle_length, fill = preg_test_class)) + 
  geom_vline(xintercept = dict$pregnancy_timeline$duration_in_days, col = "gray", linetype = 2)+
  geom_histogram(aes(y = ..density..), binwidth = 7, position = "identity", alpha = 0.5)+
  scale_x_continuous(breaks = viz$xaxis_m*28, minor_breaks = viz$xaxis_s*7,  labels = viz$xaxis_m*4, limits = c(0,20*28))+
  ylab("% of cycles")+ xlab("cycle or pregnancy duration (in weeks)")+
  scale_fill_discrete(name = "Cycle label")+ theme(legend.position = "bottom")
g_hist_lt
## Warning: Removed 1672 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

g_hist_st = ggplot(cycles, aes(x = cycle_length, fill = preg_test_class)) +   
  geom_vline(xintercept = dict$pregnancy_timeline$duration_in_days, col = "gray", linetype = 2)+
  geom_histogram(aes(y = ..density..), binwidth = 1, position = "identity", alpha = 0.5)+
  scale_x_continuous(breaks = viz$xaxis_m*28, minor_breaks = viz$xaxis_s*7,  labels = viz$xaxis_m*4, limits = c(0,150))+
  ylab("% of cycles")+ xlab("cycle or pregnancy duration (in weeks)")+
  guides(fill = FALSE)
  #facet_grid(preg_test_class ~ .)
g_hist_st
## Warning: Removed 2572 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing missing values (geom_bar).

g_inset = ggplotGrob(g_hist_st +
                  theme(plot.background = element_rect(colour = "gray40")))
## Warning: Removed 2572 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing missing values (geom_bar).
g_hist_lt + annotation_custom(
    grob = g_inset,
    xmin = 24*7,
    xmax = Inf,
    ymin = 0.03,
    ymax = Inf
  )
## Warning: Removed 1672 rows containing non-finite values (stat_bin).

## Warning: Removed 6 rows containing missing values (geom_bar).

cycles$preg_outcome = factor(cycles$preg_test_class, 
                             levels = c(dict$pregnancy_timeline$abbreviation,
                                        unique(cycles$preg_test_class)))
j = which(cycles$preg_test_class == "pregnant")

cycles$preg_outcome[j] = cut(cycles$cycle_length[j], 
                             breaks = c(0,dict$pregnancy_timeline$duration_in_days), 
                             labels = as.character(dict$pregnancy_timeline$abbreviation))


counts = table(cycles$preg_outcome[j])

table(cycles$preg_outcome[j])/sum(cycles$preg_outcome %in% c("EPL","LPL","ExPTB","PTB","TB","BF"))
## 
##      FP|VEPL          EPL          LPL        ExPTB          PTB 
##   0.37703583   0.34690554   0.12540717   0.01547231   0.01710098 
##           TB           BF      unclear   not tested     pregnant 
##   0.11156352   0.38355049   0.15228013   0.00000000   0.00000000 
## not pregnant 
##   0.00000000
j = which(cycles$preg_test_class == "pregnant")

ggplot(cycles[j,], aes(x = preg_outcome, fill = preg_outcome)) + 
  geom_bar(aes(y = (..count..)/sum(..count..)))+
  xlab("Pregnancy outcome")+ ylab("% cycles")+
  scale_fill_manual(values = dict$pregnancy_timeline$colors)+
  scale_y_continuous(labels=percent)+
  guides(fill = FALSE)

save(cycles, file = paste0(IO$output_data, "cycles.Rdata"))
file.copy(from = paste0(IO$output_data, "cycles.Rdata"), to = paste0(IO$tmp_data, "cycles_with_preg_outcome.Rdata"), overwrite = TRUE)
## [1] TRUE
users_preg_outcome =  ddply(cycles, 
                            .(user_id), 
                            .fun = summarize,
                            n_preg = sum(preg_test_class == "pregnant", na.rm = TRUE),
                            n_PL = sum(preg_outcome %in% c("EPL","LPL"), na.rm = TRUE),
                            n_LB = sum(preg_outcome %in% c("ExPTB","PTB","TB","BF"), na.rm = TRUE)
                            
)

table(users_preg_outcome$n_LB, users_preg_outcome$n_PL)
##    
##       0   1   2   3
##   0 412 395  33   4
##   1 496  86   9   0
##   2  24   3   0   0
##   3   1   0   0   0
table(users_preg_outcome$n_LB)
## 
##   0   1   2   3 
## 844 591  27   1
table(users_preg_outcome$n_PL)
## 
##   0   1   2   3 
## 933 484  42   4
j = which((users_preg_outcome$n_LB + users_preg_outcome$n_PL)>0)
table(users_preg_outcome$n_LB[j], users_preg_outcome$n_PL[j])
##    
##       0   1   2   3
##   0   0 395  33   4
##   1 496  86   9   0
##   2  24   3   0   0
##   3   1   0   0   0
table(users_preg_outcome$n_LB[j])
## 
##   0   1   2   3 
## 432 591  27   1
round(table(users_preg_outcome$n_LB[j])/sum(table(users_preg_outcome$n_LB[j])) * 100, 2)
## 
##     0     1     2     3 
## 41.10 56.23  2.57  0.10
table(users_preg_outcome$n_PL[j])
## 
##   0   1   2   3 
## 521 484  42   4
round(table(users_preg_outcome$n_PL[j])/sum(table(users_preg_outcome$n_PL[j])) * 100, 2)
## 
##     0     1     2     3 
## 49.57 46.05  4.00  0.38
dim(users)
## [1] 1463   17
dim(users_preg_outcome)
## [1] 1463    4
colnames = colnames(users_preg_outcome[,2:ncol(users_preg_outcome)])
m = match(users$user_id, users_preg_outcome$user_id)
for(colname in colnames){
  eval(parse(text = paste0("users$",colname," = users_preg_outcome$",colname,"[m]")))
}

save(users, file = paste0(IO$output_data, "users.Rdata"))
file.copy(from = paste0(IO$output_data, "users.Rdata"), to = paste0(IO$tmp_data, "users_with_preg_outcome.Rdata"), overwrite = TRUE)
## [1] TRUE

5 Predictors of pregnancy losses

5.1 Is cycle length predictive of pregnancy outcomes

load(file = paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
## Loading objects:
##   users
users$any_PL = ifelse(users$n_PL>0, TRUE, ifelse(users$n_LB>0,FALSE,NA))

u = users[which(!is.na(users$any_PL)),]


glm_cl = glm(
  any_PL ~ cycle_length_before_preg_avg + 
    cycle_length_before_preg_median + 
    cycle_length_before_preg_sd,
  data = u,
  family = "binomial")
summary(glm_cl)
## 
## Call:
## glm(formula = any_PL ~ cycle_length_before_preg_avg + cycle_length_before_preg_median + 
##     cycle_length_before_preg_sd, family = "binomial", data = u)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8277  -1.1627   0.8129   1.1861   1.3780  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                     -0.722218   0.361196  -2.000   0.0456 *
## cycle_length_before_preg_avg     0.015717   0.016355   0.961   0.3365  
## cycle_length_before_preg_median  0.008166   0.018043   0.453   0.6509  
## cycle_length_before_preg_sd     -0.004157   0.007254  -0.573   0.5666  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1456.9  on 1050  degrees of freedom
## Residual deviance: 1449.3  on 1047  degrees of freedom
## AIC: 1457.3
## 
## Number of Fisher Scoring iterations: 4
glm_cl = glm(
  any_PL ~ cycle_length_before_preg_median,
  data = u,
  family = "binomial")

summary(glm_cl)
## 
## Call:
## glm(formula = any_PL ~ cycle_length_before_preg_median, family = "binomial", 
##     data = u)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9070  -1.1705   0.9551   1.1843   1.2463  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                     -0.71296    0.35639  -2.001   0.0454 *
## cycle_length_before_preg_median  0.02402    0.01157   2.076   0.0379 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1456.9  on 1050  degrees of freedom
## Residual deviance: 1452.3  on 1049  degrees of freedom
## AIC: 1456.3
## 
## Number of Fisher Scoring iterations: 4
glm_cl = glm(
  any_PL ~ cycle_length_before_preg_sd,
  data = u,
  family = "binomial")

summary(glm_cl)
## 
## Call:
## glm(formula = any_PL ~ cycle_length_before_preg_sd, family = "binomial", 
##     data = u)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5157  -1.1709   0.8439   1.1832   1.1874  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                 -0.024834   0.066305  -0.375   0.7080  
## cycle_length_before_preg_sd  0.003201   0.001897   1.688   0.0914 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1456.9  on 1050  degrees of freedom
## Residual deviance: 1453.8  on 1049  degrees of freedom
## AIC: 1457.8
## 
## Number of Fisher Scoring iterations: 4
ggplot(u, aes(x = cycle_length_before_preg_avg, fill = any_PL))+
  geom_histogram(col = NA, alpha = 0.5, position = "identity", binwidth = 1)

ggplot(u, aes(x = cycle_length_before_preg_median, fill = any_PL))+
  geom_histogram(col = NA, alpha = 0.5, position = "identity", binwidth = 1)

ggplot(u, aes(x = cycle_length_before_preg_median, fill = any_PL))+
  geom_density(col = NA, alpha = 0.5, bw = 2)

ggplot(u, aes(x = log(cycle_length_before_preg_sd), fill = any_PL))+
  geom_histogram(col = NA, alpha = 0.5, position = "identity", binwidth = 1)
## Warning: Removed 2 rows containing non-finite values (stat_bin).

5.2 Is something else predictive of pregnancy outcomes?

5.2.1 Detect ovulation in cycles without positive pregnancy tests

  • Use HMM model (adapt it to include prior knowledge and LH tests)

  • Label days table

  • Aggregate cycles table with

    • ovulation day + uncertainty

    • temperature shift

  • Aggregate users table with

    • average, minimum and maximum temperature shift for cycles before first positive preg test

6 Examples of user tracking history with different pregnancy outcomes

Loading users and days tables.

load(file = paste0(IO$output_data,"users.Rdata"), verbose = TRUE)
## Loading objects:
##   users
load(file = paste0(IO$output_data,"days.Rdata"), verbose = TRUE)
## Loading objects:
##   days

6.1 Users with pregnancy loss

j = which((users$n_PL == 1) & (users$n_preg == 1) & (users$n_cycles %in% 15:20) & (users$n_days_obs %in% 300:400))

for(user in users$user_id[j]){
  cat("\n",user, "\n")
  d = days[which((days$user_id == user) & (days$cycle_nb >= 2)),]
  plot.tracking.history(d = d, show_tests = TRUE, average_temp = FALSE)
}
## 
##  05def79e901d45508c67bcfd44a33ddd

## 
##  507abdf43cdd491abdb3a32e65923928

## 
##  6e55b1daeae34508b9f231f2b873f537

## 
##  70d07733a9884416b056b5f819be9094

## 
##  9904052a9836407abf2f569e157c1a42

## 
##  aa1caffaa60c4a6086cd3db9bfecd5bd

## 
##  aa6193a3ce84413196177052fe87f493

## 
##  e66dd8871c6b442b927a07d73959bf4e

user_ids = c("70d07733a9884416b056b5f819be9094",
             "aa1caffaa60c4a6086cd3db9bfecd5bd",
             "aa6193a3ce84413196177052fe87f493")

6.2 Users with LB

j = which((users$n_LB == 1) & (users$n_preg == 1) & (users$n_cycles %in% 15:20) & (users$n_days_obs %in% 300:400))

for(user in users$user_id[j]){
  cat("\n",user, "\n")
  d = days[which((days$user_id == user) & (days$cycle_nb >= 2)),]
  plot.tracking.history(d = d, show_tests = TRUE, average_temp = FALSE)
}
## 
##  04251d0121874e29bf52d1428b67efa7

## 
##  1552e3fc56dc48ba977d69ed2a987841

## 
##  215b7ff9f2ed494cb7a17d4d4476270d

## 
##  3b4d245e51b94be4ac1fdbe0251ba1b1

## 
##  3b8f6fb7c38545fda33c5461bba340e5

## 
##  3ee8da51d47e4c178f739450b57d69f2

## 
##  49e5591bfe344e68aefda28a19bf792c

## 
##  5524de1daa6e4dc598e2c9fb32ff0f48

## 
##  6fa2b5df2c074a7da88a01207bed4352

## 
##  78b74e2cdfc2417096ad5f9d56132305

## 
##  7fcc75a22714449286218b08e69baeeb

## 
##  a43a65171a604f9fab859ce5c9ce2168

## 
##  c170dd91b6004d8283bd715941655019

## 
##  ce4c161eba2449d8a64aa3cba78e0bbb

## 
##  d9979a69c1bd4cd2956d649cac6cdb61

user_ids = c(user_ids, 
             "1552e3fc56dc48ba977d69ed2a987841",
             "3b8f6fb7c38545fda33c5461bba340e5",
             "3ee8da51d47e4c178f739450b57d69f2",
             "6fa2b5df2c074a7da88a01207bed4352",
             "c170dd91b6004d8283bd715941655019")

7 Selected users

for(user in user_ids){
  cat("\n",user, "\n")
  d = days[which((days$user_id == user) & (days$cycle_nb >= 2)),]
  plot.tracking.history(d = d, show_tests = TRUE, average_temp = FALSE)
}
## 
##  70d07733a9884416b056b5f819be9094

## 
##  aa1caffaa60c4a6086cd3db9bfecd5bd

## 
##  aa6193a3ce84413196177052fe87f493

## 
##  1552e3fc56dc48ba977d69ed2a987841

## 
##  3b8f6fb7c38545fda33c5461bba340e5

## 
##  3ee8da51d47e4c178f739450b57d69f2

## 
##  6fa2b5df2c074a7da88a01207bed4352

## 
##  c170dd91b6004d8283bd715941655019

user_ids = c("70d07733a9884416b056b5f819be9094",
             "c170dd91b6004d8283bd715941655019",
             "3b8f6fb7c38545fda33c5461bba340e5")
user_ids = c("70d07733a9884416b056b5f819be9094",
             "c170dd91b6004d8283bd715941655019",
             "3b8f6fb7c38545fda33c5461bba340e5")

days = days[which(days$user_id %in% user_ids),]
save(days, file = paste0(IO$tmp_data,"days_selected_users.Rdata"))